CoExp (Garcia-Ruiz et al., 2021) contains five packages which contain gene co-expression networks constructed using data from four independent databases of tissue-specific transcriptomic data. Frontal cortex is the one tissue common to all of these databases. This markdown contains an exploration of the GTEx version 6 frontal cortex, substantia nigra, putamen and caudate gene co-expression networks downloaded from GitHub. This package contains data from GTEx consortium which provides 53 different human tissues from a total of 544 healthy donors. Each network contains many modules which represent the clusters of coexpression. Relationships between the nine genes of the non-specific (NSL) complex and several other lists of genes of interest (see Genelists.Rmd) were examined.

# First initialise the datasets/packages to use
CoExpGTEx::initDb(mandatory = T)
# Open ensembl
ensembl = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
source(file.path(here::here("scripts", "CoExp_functions.R")))

1. Gene lists of interest

1.1. NSL complex

Firstly, the nine genes encoding the NSL complex will be focused on (Sheikh et al., 2019). These genes encode proteins which make up an acetyltransferase complex linked to both nuclear and mitochondrial gene expression.

# Load gene list
NSL_list <- read.delim(file.path(here::here("data", "Gene_lists", "NSL_list.txt")))
# Make gene list vector
NSL_genes <- as.character(NSL_list$Gene_symbol)

The number of genes in this list: 9.

1.2. PD mendelian

A broad list of highly penetrant genes linked unexclusively to familial forms of Parkinson's disease was obtained from PanelApp Parkinson disease and complex Parkinsonism version 1.68, a list collected and validated by a group of experts.

# Load list
PD_mendelian_list <- read.delim(file.path(here::here("data", "Gene_lists", "PD_mendelian_list.txt")))
# Make gene list vector for CoExp search
PD_mendelian_genes <- as.character(PD_mendelian_list$Gene_symbol)

The number of genes in this list: 43.

1.3. PD sporadic

A list of less penetrant, more common genes linked to sporadic Parkinson's disease was obtained from the latest GWAS, described as "Of the genes possibly associated with at least one QTL in public reference datasets and therefore testable via summary-based Mendelian randomisation, the expression or methylation of 64% was significantly associated with a possible causal change in Parkinson's disease risk."

# Load list
PD_GWAS_list <- read.delim(file.path(here::here("data", "Gene_lists", "PD_GWAS_list.txt")))
# Make gene list vector for CoExp search
PD_GWAS_genes <- as.character(PD_GWAS_list$Gene_symbol)

The number of genes in this list: 150.

2. Frontal cortex gene co-expression network

2.1. Network structure

The overall structure of the network was first examined.

# Display all modules in a specified tissue/network 
CoExpNets::plotModSizes(tissue = "FCortex",
                        which.one = "gtexv6")

# Display network structure
Eigengenes <- getNetworkEigengenes(which.one = "gtexv6", 
                                   tissue = "FCortex")
cat("Frontal cortex network was obtained from", nrow(Eigengenes),
    "samples and has", ncol(Eigengenes), "modules\n")
## Frontal cortex network was obtained from 108 samples and has 17 modules
plotEGClustering(which.one = "gtexv6", 
                 tissue = "FCortex")

# Display correlations between modules
corrplot(cor(Eigengenes), tl.srt = 45,
         tl.col = "black",
         type = "upper", 
         tl.cex = 0.45,
         order = "hclust",
         title = paste0("Eigengene correlations"),
         mar = c(3, 2, 2.5, 4))

2.2. Gene set enrichment analysis

NSL genes

First, the NSL genes were searched in the coexpression network.

# Search genes of interest in specified tissue/network, including previously investigated KANSL1 ens.correction
NSL_report_gt <- reportOnGenes(tissue = "FCortex", 
                               genes = NSL_genes, 
                               which.one = "gtexv6", 
                               ens.correction = c("ENSG00000257382", "ENSG00000120071"))
# Select report
NSL_rep_gt <- NSL_report_gt$report 
# Convert report into dataframe
NSL_rep_gt_length <- nrow(NSL_rep_gt)
NSL_rep_gt <- data.frame(matrix(unlist(NSL_rep_gt), 
                                nrow = NSL_rep_gt_length, 
                                byrow = FALSE), 
                         stringsAsFactors=FALSE)
names(NSL_rep_gt)[1] <- "Gene"
names(NSL_rep_gt)[2] <- "ENSG"
names(NSL_rep_gt)[3] <- "MM"
names(NSL_rep_gt)[4] <- "Module"
names(NSL_rep_gt)[5] <- "Size"
names(NSL_rep_gt)[6] <- "GO_report"
names(NSL_rep_gt)[7] <- "PD_genes"
names(NSL_rep_gt)[8] <- "Preservation"
names(NSL_rep_gt)[9] <- "Cell_type"
names(NSL_rep_gt)[10] <- "Fisher"
# FDR correct p values
NSL_rep_gt$Fisher <- as.numeric(NSL_rep_gt$Fisher)
NSL_rep_gt$FDR <- p.adjust(NSL_rep_gt$Fisher, method="fdr")

The number of NSL genes is 9 and the number of genes in the search output was 9, suggesting there were 0 genes missing from the search.

# Print report without GO term and PD genes columns
NSL_rep_gt[c(1,4,3,5,10,11,9)]
# Store NSL enriched modules
NSL_gt_modules <- unique(NSL_rep_gt$Module)
# Select columns needed for figure
NSL_fig_gt <- data.frame(NSL_rep_gt$Module, NSL_rep_gt$Gene, NSL_rep_gt$FDR)
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.FDR"] <- "FDR"
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.Module"] <- "Module"
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.Gene"] <- "Gene"
# Prep for figure 
NSL_fig_gt$FDR <- as.numeric(NSL_fig_gt$FDR)
NSL_fig_gt$Module <- as.character(NSL_fig_gt$Module)
NSL_fig_gt <- dplyr::arrange(NSL_fig_gt, Module)
NSL_fig_gt$`-log10(FDR)` <- -log10(NSL_fig_gt$FDR)
# Produce plot
ggplot(NSL_fig_gt, aes(Gene, Module)) +
  geom_tile(aes(fill = `-log10(FDR)`)) +
  geom_text(aes(label = signif.num(FDR))) +
  scale_fill_gradient(low = "white", high = "goldenrod3", limits = c(0,2.4)) +
  labs(y = "Module") +
  theme(
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm"))
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

The NSL genes appeared in modules red, greenyellow, magenta, darkred, grey60 in this GCN but were only significantly enriched in the red module. The eigengenes plot shows modules darkred and red to be relatively close together, as well as modules greenyellow and grey60, suggesting the genes within these pairs of modules may have some level of coexpression.

PD mendelian genes

Then the PD mendelian list was searched and the results filtered for the NSL enriched modules.

# Search genes of interest in specified tissue/network 
PD_mendelian_report_gt <- reportOnGenes(tissue = "FCortex", 
                                        genes = PD_mendelian_genes, 
                                        which.one = "gtexv6")
# Select report
PD_mendelian_rep_gt <- PD_mendelian_report_gt$report 
# Count number of genes in output
PD_mendelian_rep_gt_length <- nrow(PD_mendelian_rep_gt)

The number of PD mendelian genes is 43 and the number of genes in the search output was 39, suggesting there were 4 genes missing from the search which needed to be identified.

# Show missing genes
dplyr::setdiff(PD_mendelian_genes, PD_mendelian_rep_gt$gene)
## [1] "GBA"    "MAPT"   "PRKN"   "SLC6A3"
# Check if any synonyms of the genes are present in networks
check_synonyms("FCortex", 
               "gtexv6", 
               PD_mendelian_genes, 
               PD_mendelian_rep_gt$gene)

1 gene appeared in the network under an alias name, so this was replaced in the search list.

# Replace this in the list of genes
PD_mendelian_gt_genes <- as.character(PD_mendelian_genes)
PD_mendelian_gt_genes[PD_mendelian_gt_genes=="PRKN"] <- "PARK2"
# If genes still missing, check if ENSGs appear in source files
check_source_files("FCortex",
                   "gtexv6", 
                   PD_mendelian_gt_genes, 
                   PD_mendelian_rep_gt$gene, 
                   file.path(here::here("data", "CoExp_source", "FCortex.resids.rds")))

2 more of the missing genes appeared in the network under these ENSGs, so an ens.correction was needed.

# Return ENSGs that are in the networks 
fromGeneName2Ensembl("GBA")
## [1] "ENSG00000262446"
fromGeneName2Ensembl("MAPT")
## [1] "ENSG00000257437"

The remaining gene did not appear in the source files under any ENSG from ensembl, thus it was assumed this gene does not appear in this dataset. The search can was then repeated with the most complete gene list possible.

# Again search genes of interest in specified tissue/network 
PD_mendelian_report2_gt <- reportOnGenes(tissue = "FCortex", 
                                         genes = PD_mendelian_gt_genes, 
                                         which.one = "gtexv6", 
                                         ens.correction = c("ENSG00000262446", "ENSG00000177628",
                                                            "ENSG00000257437", "ENSG00000186868"))
# Select report
PD_mendelian_rep2_gt <- PD_mendelian_report2_gt$report 
# Count number of genes in output
PD_mendelian_rep2_gt_length <- nrow(PD_mendelian_rep2_gt)

The number of genes in the final search output was 42, showing there was 1 fewer gene than the original input list.

# Convert report into dataframe
PD_mendelian_rep2_gt <- data.frame(matrix(unlist(PD_mendelian_rep2_gt), 
                                          nrow = PD_mendelian_rep2_gt_length, 
                                          byrow = FALSE), stringsAsFactors=FALSE)
names(PD_mendelian_rep2_gt)[1] <- "Gene"
names(PD_mendelian_rep2_gt)[2] <- "ENSG"
names(PD_mendelian_rep2_gt)[3] <- "MM"
names(PD_mendelian_rep2_gt)[4] <- "Module"
names(PD_mendelian_rep2_gt)[5] <- "Size"
names(PD_mendelian_rep2_gt)[6] <- "GO_report"
names(PD_mendelian_rep2_gt)[7] <- "PD_genes"
names(PD_mendelian_rep2_gt)[8] <- "Preservation"
names(PD_mendelian_rep2_gt)[9] <- "Cell_type"
names(PD_mendelian_rep2_gt)[10] <- "Fisher"
# Filter for NSL gene enriched modules
PD_mendelian_NSL_mod_gt <- PD_mendelian_rep2_gt %>%
  filter(Module %in% NSL_gt_modules)
# Print report without GO term and PD genes columns
PD_mendelian_NSL_mod_gt[, c(-6, -7, -8)]

PD sporadic genes

The PD sporadic list was also searched and the results filtered for the NSL enriched modules.

# Search genes of interest in specified tissue/network
PD_GWAS_report_gt <- reportOnGenes(tissue = "FCortex", 
                                   genes = PD_GWAS_genes, 
                                   which.one = "gtexv6")
# Select report
PD_GWAS_rep_gt <- PD_GWAS_report_gt$report 
# Count number of genes in output
PD_GWAS_rep_gt_length <- nrow(PD_GWAS_rep_gt)

The number of PD GWAS genes is 150 and the number of genes in the search output was 122, suggesting there were 28 genes missing from the search which needed to be identified.

# Show missing genes
dplyr::setdiff(PD_GWAS_genes, PD_GWAS_rep_gt$gene)
##  [1] "CCAR2"       "CSTA"        "DNAH17"      "GIN1"        "GPR65"      
##  [6] "HIST1H4J"    "HLA-DQA1"    "HLA-DQA2"    "HLA-DQB1"    "HLA-DRA"    
## [11] "HLA-DRB5"    "HLA-DRB6"    "KLHL7-DT"    "MAPT"        "MMRN1"      
## [16] "NOD2"        "PPIP5K2"     "RAB29"       "RETREG3"     "SGF29"      
## [21] "SLC4A1"      "TRIM10"      "TRIM31"      "TRIM40"      "VKORC1"     
## [26] "WDR5B"       "ZNF668"      "ZSCAN16-AS1"
# Check if any synonyms of the genes are present in networks
check_synonyms("FCortex", 
               "gtexv6", 
               PD_GWAS_genes, 
               PD_GWAS_rep_gt$gene)

5 genes appeared in the network under alias names, so these were replaced in the search list.

# Replace this in the list of genes
PD_GWAS_gt_genes <- as.character(PD_GWAS_genes)
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="CCAR2"] <- "KIAA1967"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="KLHL7-DT"] <- "KLHL7-AS1"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="RAB29"] <- "RAB7L1"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="RETREG3"] <- "FAM134C"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="SGF29"] <- "CCDC101"
# If genes still missing, check if ENSGs appear in source files
check_source_files("FCortex", 
                   "gtexv6", 
                   PD_GWAS_gt_genes, 
                   PD_GWAS_rep_gt$gene, 
                   file.path(here::here("data", "CoExp_source", "FCortex.resids.rds")))

8 more of the missing genes appeared in the network under these ENSGs, so an ens.correction was needed.

# Return ENSGs that are in the networks
fromGeneName2Ensembl("GIN1")
## [1] "ENSG00000263031"
fromGeneName2Ensembl("HLA-DQB1")
## [1] "ENSG00000233209"
fromGeneName2Ensembl("HLA-DRA")
## [1] "ENSG00000228987"
fromGeneName2Ensembl("MAPT")
## [1] "ENSG00000257437"
fromGeneName2Ensembl("PPIP5K2")
## [1] "ENSG00000262234"
fromGeneName2Ensembl("VKORC1")
## [1] "ENSG00000255439"
fromGeneName2Ensembl("ZNF668")
## [1] "ENSG00000232748"
fromGeneName2Ensembl("ZSCAN16-AS1")
## [1] "ZSCAN16-AS1"
# Return gene name that is attached to the correct ENSG of this gene
fromEnsembl2GeneName("ENSG00000269293")
## [1] "RP1-265C24.9"
# Replace this in the search list
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="ZSCAN16-AS1"] <- "RP1-265C24.9"

One of the above genes, ZSCAN16-AS1, could not be identified through the method of searching fromGeneName2Ensembl, but the ENSG was found instead using fromEnsembl2GeneName to be assigned to the gene name RP1-265C24.9, so this was replaced in the input list. The remaining genes did not appear in the source files under any ENSG from ensembl, thus it was assumed these genes do not appear in this dataset. A quick check on GTEx itself for these genes displayed very low levels in the frontal cortex. The search was then repeated with the most complete gene list possible.

# Again search genes of interest in specified tissue/network 
PD_GWAS_report2_gt <- reportOnGenes(tissue = "FCortex", 
                                    genes = PD_GWAS_gt_genes, 
                                    which.one = "gtexv6", 
                                    ens.correction = c("ENSG00000263031", "ENSG00000145723",
                                                       "ENSG00000233209", "ENSG00000179344",
                                                       "ENSG00000228987", "ENSG00000204287",
                                                       "ENSG00000257437", "ENSG00000186868",
                                                       "ENSG00000262234", "ENSG00000145725", 
                                                       "ENSG00000255439", "ENSG00000167397",
                                                       "ENSG00000232748", "ENSG00000167394"))
# Select report
PD_GWAS_rep2_gt <- PD_GWAS_report2_gt$report 
# Count number of genes in output
PD_GWAS_rep2_gt_length <- nrow(PD_GWAS_rep2_gt)

The number of genes in the final search output was 135, showing there were 15 fewer genes than the original input list.

# Convert report into dataframe
PD_GWAS_rep2_gt <- data.frame(matrix(unlist(PD_GWAS_rep2_gt), 
                                     nrow = PD_GWAS_rep2_gt_length, 
                                     byrow = FALSE), 
                              stringsAsFactors=FALSE)
names(PD_GWAS_rep2_gt)[1] <- "Gene"
names(PD_GWAS_rep2_gt)[2] <- "ENSG"
names(PD_GWAS_rep2_gt)[3] <- "MM"
names(PD_GWAS_rep2_gt)[4] <- "Module"
names(PD_GWAS_rep2_gt)[5] <- "Size"
names(PD_GWAS_rep2_gt)[6] <- "GO_report"
names(PD_GWAS_rep2_gt)[7] <- "PD_genes"
names(PD_GWAS_rep2_gt)[8] <- "Preservation"
names(PD_GWAS_rep2_gt)[9] <- "Cell_type"
names(PD_GWAS_rep2_gt)[10] <- "Fisher"
# Filter for NSL modules
PD_GWAS_NSL_mod_gt <- PD_GWAS_rep2_gt %>%
  filter(Module %in% NSL_gt_modules)
# Print report without GO term and PD genes column
PD_GWAS_NSL_mod_gt[, c(-6, -7, -8)]

Gene ontology terms

To examine the functional characteristics of these modules, the GO term enrichments annotated to these modules were extracted and simplified according to parent terms.

# Obtain gprofiler2 annotations from the CoExpGTEx frontal cortex source files
e = read.csv(file.path(here::here("data", 
                                  "CoExp_source", 
                                  "netFCortex.9.it.50.rds_gprof.csv")), 
             stringsAsFactors = FALSE)
# Extract all GO term enrichments from source file for modules of interest
NSL_GO = e[e$query.number %in% NSL_gt_modules & e$domain %in% c("BP", "CC", "MF"),]
NSL_GO_2 <- dplyr::select(NSL_GO, query.number, term.id, term.name, domain, p.value)
# Rename columns
names(NSL_GO_2)[names(NSL_GO_2)=="query.number"] <- "gene_list"
names(NSL_GO_2)[names(NSL_GO_2)=="term.id"] <- "go_id"
names(NSL_GO_2)[names(NSL_GO_2)=="domain"] <- "go_type"
names(NSL_GO_2)[names(NSL_GO_2)=="term.name"] <- "go_term"
# FDR correct p values
NSL_GO_2$p.value <- as.numeric(NSL_GO_2$p.value)
NSL_GO_2$FDR <- p.adjust(NSL_GO_2$p.value, method="fdr")
# GO reduce all terms to parent terms
NSL_GO_red <- go_reduce(pathway_df = NSL_GO_2, 
                        threshold = 0.7,
                        scores = NULL)
# Format
NSL_GO_red$go_type[NSL_GO_red$go_type=="BP"] <- "GO: BP"
NSL_GO_red$go_type[NSL_GO_red$go_type=="MF"] <- "GO: MF"
NSL_GO_red$go_type[NSL_GO_red$go_type=="CC"] <- "GO: CC"
NSL_GO_red <- NSL_GO_red %>%
  tidyr::drop_na(parent_term) %>%
  dplyr::arrange(desc(FDR))
# Plot
ggplot2::ggplot(NSL_GO_red, aes(x = gene_list, y = parent_term)) + 
  geom_point(size = 2, aes(colour = -log10(`FDR`))) +
  scale_color_continuous(low = "lightgoldenrod", high = "red4", limits = c(0,58)) +
  facet_grid(go_type ~., scales = "free", space = "free") +
  labs(x = "Module", y = "") +
  theme(
    strip.text.y = element_text(angle = 90, size = 6),
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm")
    )

2.2. Summary

The frequency of genes within each gene list apppearing in each of the 5 NSL enriched modules was calculated.

# Collate module results into one table
gt1 <- PD_mendelian_NSL_mod_gt['Module']
gt1['Gene_list'] <- "PD_mendelian"
gt2 <- PD_GWAS_NSL_mod_gt['Module']
gt2['Gene_list'] <- "PD_GWAS"
gt_mods_list <- do.call(rbind, list(gt1, gt2))
# Display as frequency of each module within each list
gt_mod_freq <- with(gt_mods_list, table(Module, Gene_list))
gt_mod_freq <- as.data.frame.matrix(gt_mod_freq)
knitr::kable(gt_mod_freq[c(5,1,4,2,3),])
PD_GWAS PD_mendelian
red 19 4
darkred 9 3
magenta 5 1
greenyellow 3 2
grey60 6 0

The significance of the enrichments of each gene list within each of the 5 NSL enriched modules was also calculated. P-values were FDR-corrected before summarising.

# FDR correct p values
PD_mendelian_rep2_gt$FDR <- p.adjust(PD_mendelian_rep2_gt$Fisher, method = "fdr")
PD_GWAS_rep2_gt$FDR <- p.adjust(PD_GWAS_rep2_gt$Fisher, method = "fdr")
# Collate module results into one table
gts1 <- data.frame(PD_mendelian_rep2_gt$Module, PD_mendelian_rep2_gt$FDR)
names(gts1)[names(gts1)=="PD_mendelian_rep2_gt.FDR"] <- "FDR"
names(gts1)[names(gts1)=="PD_mendelian_rep2_gt.Module"] <- "Module"
gts1$Gene_list <- "PD_mendelian"
gts2 <- data.frame(PD_GWAS_rep2_gt$Module, PD_GWAS_rep2_gt$FDR)
names(gts2)[names(gts2)=="PD_GWAS_rep2_gt.FDR"] <- "FDR"
names(gts2)[names(gts2)=="PD_GWAS_rep2_gt.Module"] <- "Module"
gts2$Gene_list <- "PD_GWAS"
gt_pvals_list <- dplyr::bind_rows(gts1, gts2)
# Display as true/false table
gt_TF <- gt_pvals_list %>% 
  dplyr::distinct() %>%
  tidyr::spread(Gene_list, FDR)
gt_TF <- dplyr::filter(gt_TF, Module %in% NSL_gt_modules)
gt_TF <- tibble::column_to_rownames(gt_TF, var = "Module")
knitr::kable(gt_TF[c(5,1,4,2,3),])
PD_GWAS PD_mendelian
red 0.0294584 0.5111400
darkred 0.8170991 0.7668182
magenta 0.9431929 0.9228000
greenyellow 0.9975000 0.8769600
grey60 0.9431929 NA
# Filter out modules without NSL genes
melt_summ <- gt_pvals_list %>%
  dplyr::filter(Module %in% NSL_gt_modules) %>%
  dplyr::distinct()
# Prep for figure 
melt_summ$Gene_list[melt_summ$Gene_list=="PD_mendelian"] <- "Mendelian PD"
melt_summ$Gene_list[melt_summ$Gene_list=="PD_GWAS"] <- "Sporadic PD"
melt_summ$FDR <- as.numeric(melt_summ$FDR)
melt_summ$Module <- as.character(melt_summ$Module)
melt_summ$Gene_list <- as.character(melt_summ$Gene_list)
melt_summ <- dplyr::arrange(melt_summ, Module)
melt_summ$`-log10(FDR)` <- -log10(melt_summ$FDR)
# Plot
ggplot(melt_summ, aes(Gene_list, Module)) +
  geom_tile(aes(fill = `-log10(FDR)`)) +
  geom_text(aes(label = signif.num(FDR))) +
  scale_fill_gradient(low = "white", high = "lightsteelblue4", limits = c(0, 2.4)) +
  labs(y = "Module", x = "Gene list") +
  theme(
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm"))
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

3. Substantia nigra gene co-expression network

3.1. Network structure

The overall structure of the network was first examined.

# Display all modules in a specified tissue/network 
CoExpNets::plotModSizes(tissue = "Substantianigra",
                        which.one = "gtexv6")

# Display network structure
Eigengenes <- getNetworkEigengenes(which.one = "gtexv6", 
                                   tissue = "Substantianigra")
cat("Substantia nigra network was obtained from", nrow(Eigengenes),
    "samples and has", ncol(Eigengenes), "modules\n")
## Substantia nigra network was obtained from 63 samples and has 37 modules
plotEGClustering(which.one = "gtexv6", 
                 tissue = "Substantianigra")

# Display correlations between modules
corrplot(cor(Eigengenes), tl.srt = 45,
         tl.col = "black",
         type = "upper", 
         tl.cex = 0.45,
         order = "hclust",
         title = paste0("Eigengene correlations"),
         mar = c(3, 2, 2.5, 4))

3.2. Gene set enrichment analysis

NSL genes

First, the NSL genes were searched in the coexpression network.

# Search genes of interest in specified tissue/network, including previously investigated KANSL1 ens.correction
NSL_report_gt <- reportOnGenes(tissue = "Substantianigra", 
                               genes = NSL_genes, 
                               which.one = "gtexv6", 
                               ens.correction = c("ENSG00000257382", "ENSG00000120071"))
# Select report
NSL_rep_gt <- NSL_report_gt$report 
# Convert report into dataframe
NSL_rep_gt_length <- nrow(NSL_rep_gt)
NSL_rep_gt <- data.frame(matrix(unlist(NSL_rep_gt), 
                                nrow = NSL_rep_gt_length, 
                                byrow = FALSE), stringsAsFactors=FALSE)
names(NSL_rep_gt)[1] <- "Gene"
names(NSL_rep_gt)[2] <- "ENSG"
names(NSL_rep_gt)[3] <- "MM"
names(NSL_rep_gt)[4] <- "Module"
names(NSL_rep_gt)[5] <- "Size"
names(NSL_rep_gt)[6] <- "GO_report"
names(NSL_rep_gt)[7] <- "PD_genes"
names(NSL_rep_gt)[8] <- "Preservation"
names(NSL_rep_gt)[9] <- "Cell_type"
names(NSL_rep_gt)[10] <- "Fisher"
# FDR correct p values
NSL_rep_gt$Fisher <- as.numeric(NSL_rep_gt$Fisher)
NSL_rep_gt$FDR <- p.adjust(NSL_rep_gt$Fisher, method="fdr")

The number of NSL genes is 9 and the number of genes in the search output was 9, suggesting there were 0 genes missing from the search.

# Print report without GO term and PD genes columns
NSL_rep_gt[,c(1,4,3,5,10,11,9)]
# Store NSL enriched modules
NSL_gt_modules <- unique(NSL_rep_gt$Module)
# Select columns needed for figure
NSL_fig_gt <- data.frame(NSL_rep_gt$Module, NSL_rep_gt$Gene, NSL_rep_gt$FDR)
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.FDR"] <- "FDR"
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.Module"] <- "Module"
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.Gene"] <- "Gene"
# Prep for figure 
NSL_fig_gt$FDR <- as.numeric(NSL_fig_gt$FDR)
NSL_fig_gt$Module <- as.character(NSL_fig_gt$Module)
NSL_fig_gt <- dplyr::arrange(NSL_fig_gt, Module)
NSL_fig_gt$`-log10(FDR)` <- -log10(NSL_fig_gt$FDR)
# Produce plot
ggplot(NSL_fig_gt, aes(Gene, Module)) +
  geom_tile(aes(fill = `-log10(FDR)`)) +
  geom_text(aes(label = signif.num(FDR))) +
  scale_fill_gradient(low = "white", high = "goldenrod3", limits = c(0,2.4)) +
  labs(y = "Module") +
  theme(
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm"))
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

The NSL genes appeared in modules floralwhite, darkorange, mediumpurple3, plum2, greenyellow, bisque4, black in this GCN and were not signifcantly enriched in any module. The eigengenes plot shows modules dark orange and mediumpurple3 to be relatively close together, as well as modules greenyellow, bisque4 and black, suggesting the genes within these pairs of modules may have some level of coexpression.

PD mendelian genes

Then the PD mendelian list was searched and the results filtered for the NSL enriched modules.

# Search genes of interest in specified tissue/network 
PD_mendelian_report_gt <- reportOnGenes(tissue = "Substantianigra", 
                                        genes = PD_mendelian_genes, 
                                        which.one = "gtexv6")
# Select report
PD_mendelian_rep_gt <- PD_mendelian_report_gt$report 
# Count number of genes in output
PD_mendelian_rep_gt_length <- nrow(PD_mendelian_rep_gt)

The number of PD mendelian genes is 43 and the number of genes in the search output was 40, suggesting there were 3 genes missing from the search which needed to be identified.

# Show missing genes
dplyr::setdiff(PD_mendelian_genes, PD_mendelian_rep_gt$gene)
## [1] "GBA"  "MAPT" "PRKN"
# Check if any synonyms of the genes are present in networks
check_synonyms("Substantianigra", 
               "gtexv6", 
               PD_mendelian_genes, 
               PD_mendelian_rep_gt$gene)

1 gene appeared in the network under an alias name, so this was replaced in the search list.

# Replace this in the list of genes
PD_mendelian_gt_genes <- as.character(PD_mendelian_genes)
PD_mendelian_gt_genes[PD_mendelian_gt_genes=="PRKN"] <- "PARK2"
# If genes still missing, check if ENSGs appear in source files
check_source_files("Substantianigra", 
                   "gtexv6", 
                   PD_mendelian_gt_genes, 
                   PD_mendelian_rep_gt$gene, 
                   file.path(here::here("data", "CoExp_source", "Substantianigra.resids.rds")))

2 more of the missing genes appeared in the network under these ENSGs, so an ens.correction was needed.

# Return ENSGs that are in the networks 
fromGeneName2Ensembl("GBA")
## [1] "ENSG00000262446"
fromGeneName2Ensembl("MAPT")
## [1] "ENSG00000257437"

The remaining gene did not appear in the source files under any ENSG from ensembl, thus it was assumed this gene does not appear in this dataset. The search can was then repeated with the most complete gene list possible.

# Again search genes of interest in specified tissue/network 
PD_mendelian_report2_gt <- reportOnGenes(tissue = "Substantianigra", 
                                         genes = PD_mendelian_gt_genes, 
                                         which.one = "gtexv6", 
                                         ens.correction = c("ENSG00000262446", "ENSG00000177628",
                                                            "ENSG00000257437", "ENSG00000186868"))
# Select report
PD_mendelian_rep2_gt <- PD_mendelian_report2_gt$report 
# Count number of genes in output
PD_mendelian_rep2_gt_length <- nrow(PD_mendelian_rep2_gt)

The number of genes in the final search output was 43, showing there was 0 fewer gene than the original input list.

# Convert report into dataframe
PD_mendelian_rep2_gt <- data.frame(matrix(unlist(PD_mendelian_rep2_gt), 
                                          nrow = PD_mendelian_rep2_gt_length, 
                                          byrow = FALSE), stringsAsFactors=FALSE)
names(PD_mendelian_rep2_gt)[1] <- "Gene"
names(PD_mendelian_rep2_gt)[2] <- "ENSG"
names(PD_mendelian_rep2_gt)[3] <- "MM"
names(PD_mendelian_rep2_gt)[4] <- "Module"
names(PD_mendelian_rep2_gt)[5] <- "Size"
names(PD_mendelian_rep2_gt)[6] <- "GO_report"
names(PD_mendelian_rep2_gt)[7] <- "PD_genes"
names(PD_mendelian_rep2_gt)[8] <- "Preservation"
names(PD_mendelian_rep2_gt)[9] <- "Cell_type"
names(PD_mendelian_rep2_gt)[10] <- "Fisher"
# Filter for NSL gene enriched modules
PD_mendelian_NSL_mod_gt <- PD_mendelian_rep2_gt %>%
  filter(Module %in% NSL_gt_modules)
# Print report without GO term and PD genes columns
PD_mendelian_NSL_mod_gt[, c(1,4,3,5,10,9)]

PD sporadic genes

The PD sporadic list was also searched and the results filtered for the NSL enriched modules.

# Search genes of interest in specified tissue/network
PD_GWAS_report_gt <- reportOnGenes(tissue = "Substantianigra",
                                   genes = PD_GWAS_genes, 
                                   which.one = "gtexv6")
# Select report
PD_GWAS_rep_gt <- PD_GWAS_report_gt$report 
# Count number of genes in output
PD_GWAS_rep_gt_length <- nrow(PD_GWAS_rep_gt)

The number of PD GWAS genes is 150 and the number of genes in the search output was 128, suggesting there were 22 genes missing from the search which needed to be identified.

# Show missing genes
dplyr::setdiff(PD_GWAS_genes, PD_GWAS_rep_gt$gene)
##  [1] "CCAR2"       "GIN1"        "HIST1H4J"    "HLA-DQA1"    "HLA-DQA2"   
##  [6] "HLA-DQB1"    "HLA-DRA"     "HLA-DRB6"    "KLHL7-DT"    "MAPT"       
## [11] "PPIP5K2"     "RAB29"       "RETREG3"     "SGF29"       "SLC4A1"     
## [16] "TRIM10"      "TRIM31"      "TRIM40"      "VKORC1"      "WDR5B"      
## [21] "ZNF668"      "ZSCAN16-AS1"
# Check if any synonyms of the genes are present in networks
check_synonyms("Substantianigra", 
               "gtexv6",
               PD_GWAS_genes, 
               PD_GWAS_rep_gt$gene)

5 genes appeared in the network under alias names, so these were replaced in the search list.

# Replace this in the list of genes
PD_GWAS_gt_genes <- as.character(PD_GWAS_genes)
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="CCAR2"] <- "KIAA1967"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="KLHL7-DT"] <- "KLHL7-AS1"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="RAB29"] <- "RAB7L1"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="RETREG3"] <- "FAM134C"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="SGF29"] <- "CCDC101"
# If genes still missing, check if ENSGs appear in source files
check_source_files("Substantianigra", 
                   "gtexv6", 
                   PD_GWAS_gt_genes, 
                   PD_GWAS_rep_gt$gene, 
                   file.path(here::here("data", "CoExp_source", "Substantianigra.resids.rds")))

8 more of the missing genes appeared in the network under these ENSGs, so an ens.correction was needed.

# Return ENSGs that are in the networks
fromGeneName2Ensembl("GIN1")
## [1] "ENSG00000263031"
fromGeneName2Ensembl("HLA-DQB1")
## [1] "ENSG00000233209"
fromGeneName2Ensembl("HLA-DRA")
## [1] "ENSG00000228987"
fromGeneName2Ensembl("MAPT")
## [1] "ENSG00000257437"
fromGeneName2Ensembl("PPIP5K2")
## [1] "ENSG00000262234"
fromGeneName2Ensembl("VKORC1")
## [1] "ENSG00000255439"
fromGeneName2Ensembl("ZNF668")
## [1] "ENSG00000232748"
fromGeneName2Ensembl("ZSCAN16-AS1")
## [1] "ZSCAN16-AS1"
# Return gene name that is attached to the correct ENSG of this gene
fromEnsembl2GeneName("ENSG00000269293")
## [1] "RP1-265C24.9"
# Replace this in the search list
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="ZSCAN16-AS1"] <- "RP1-265C24.9"

One of the above genes, ZSCAN16-AS1, could not be identified through the method of searching fromGeneName2Ensembl, but the ENSG was found instead using fromEnsembl2GeneName to be assigned to the gene name RP1-265C24.9, so this was replaced in the input list. The remaining genes did not appear in the source files under any ENSG from ensembl, thus it was assumed these genes do not appear in this dataset. A quick check on GTEx itself for these genes displayed very low levels in the frontal cortex. The search was then repeated with the most complete gene list possible.

# Again search genes of interest in specified tissue/network 
PD_GWAS_report2_gt <- reportOnGenes(tissue = "Substantianigra", 
                                    genes = PD_GWAS_gt_genes, 
                                    which.one = "gtexv6", 
                                    ens.correction = c("ENSG00000263031", "ENSG00000145723",
                                                       "ENSG00000233209", "ENSG00000179344",
                                                       "ENSG00000228987", "ENSG00000204287",
                                                       "ENSG00000257437", "ENSG00000186868",
                                                       "ENSG00000262234", "ENSG00000145725", 
                                                       "ENSG00000255439", "ENSG00000167397",
                                                       "ENSG00000232748", "ENSG00000167394"))
# Select report
PD_GWAS_rep2_gt <- PD_GWAS_report2_gt$report 
# Count number of genes in output
PD_GWAS_rep2_gt_length <- nrow(PD_GWAS_rep2_gt)

The number of genes in the final search output was 141, showing there were 9 fewer genes than the original input list.

# Convert report into dataframe
PD_GWAS_rep2_gt <- data.frame(matrix(unlist(PD_GWAS_rep2_gt), 
                                     nrow = PD_GWAS_rep2_gt_length, 
                                     byrow = FALSE), 
                              stringsAsFactors=FALSE)
names(PD_GWAS_rep2_gt)[1] <- "Gene"
names(PD_GWAS_rep2_gt)[2] <- "ENSG"
names(PD_GWAS_rep2_gt)[3] <- "MM"
names(PD_GWAS_rep2_gt)[4] <- "Module"
names(PD_GWAS_rep2_gt)[5] <- "Size"
names(PD_GWAS_rep2_gt)[6] <- "GO_report"
names(PD_GWAS_rep2_gt)[7] <- "PD_genes"
names(PD_GWAS_rep2_gt)[8] <- "Preservation"
names(PD_GWAS_rep2_gt)[9] <- "Cell_type"
names(PD_GWAS_rep2_gt)[10] <- "Fisher"
# Filter for NSL modules
PD_GWAS_NSL_mod_gt <- PD_GWAS_rep2_gt %>%
  filter(Module %in% NSL_gt_modules)
# Print report without GO term and PD genes column
PD_GWAS_NSL_mod_gt[, c(1,4,3,5,10,9)]

Gene ontology terms

To examine the functional characteristics of these modules, the GO term enrichments annotated to these modules were extracted and simplified according to parent terms.

# Obtain gprofiler2 annotations from the CoExpGTEx frontal cortex source files
e = read.csv(file.path(here::here("data", 
                                  "CoExp_source", 
                                  "netSubstantianigra.25.it.50.rds_gprof.csv")), 
             stringsAsFactors = FALSE)
# Extract all GO term enrichments from source file for modules of interest
NSL_GO = e[e$query.number %in% NSL_gt_modules & e$domain %in% c("BP", "CC", "MF"),]
NSL_GO_2 <- dplyr::select(NSL_GO, query.number, term.id, term.name, domain, p.value)
# Rename columns
names(NSL_GO_2)[names(NSL_GO_2)=="query.number"] <- "gene_list"
names(NSL_GO_2)[names(NSL_GO_2)=="term.id"] <- "go_id"
names(NSL_GO_2)[names(NSL_GO_2)=="domain"] <- "go_type"
names(NSL_GO_2)[names(NSL_GO_2)=="term.name"] <- "go_term"
# FDR correct p values
NSL_GO_2$p.value <- as.numeric(NSL_GO_2$p.value)
NSL_GO_2$FDR <- p.adjust(NSL_GO_2$p.value, method="fdr")
# GO reduce all terms to parent terms
NSL_GO_red <- go_reduce(pathway_df = NSL_GO_2, 
                        threshold = 0.7,
                        scores = NULL)
# Format
NSL_GO_red$go_type[NSL_GO_red$go_type=="BP"] <- "GO: BP"
NSL_GO_red$go_type[NSL_GO_red$go_type=="MF"] <- "GO: MF"
NSL_GO_red$go_type[NSL_GO_red$go_type=="CC"] <- "GO: CC"
NSL_GO_red <- NSL_GO_red %>%
  tidyr::drop_na(parent_term) %>%
  dplyr::arrange(desc(FDR))
# Plot
ggplot2::ggplot(NSL_GO_red, aes(x = gene_list, y = parent_term)) + 
  geom_point(size = 2, aes(colour = -log10(`FDR`))) +
  scale_color_continuous(low = "lightgoldenrod", high = "red4", limits = c(0,58)) +
  facet_grid(go_type ~., scales = "free", space = "free") +
  labs(x = "Module", y = "") +
  theme(
    strip.text.y = element_text(angle = 90, size = 6),
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm")
    )

3.2. Summary

The frequency of genes within each gene list apppearing in each of the 7 NSL enriched modules was calculated.

# Collate module results into one table
gt1 <- PD_mendelian_NSL_mod_gt['Module']
gt1['Gene_list'] <- "PD_mendelian"
gt2 <- PD_GWAS_NSL_mod_gt['Module']
gt2['Gene_list'] <- "PD_GWAS"
gt_mods_list <- do.call(rbind, list(gt1, gt2))
# Display as frequency of each module within each list
gt_mod_freq <- with(gt_mods_list, table(Module, Gene_list))
gt_mod_freq <- as.data.frame.matrix(gt_mod_freq)
knitr::kable(gt_mod_freq)
PD_GWAS PD_mendelian
bisque4 5 2
black 2 0
darkorange 4 3
floralwhite 13 4
greenyellow 9 1
mediumpurple3 1 0
plum2 3 0

The significance of the enrichments of each gene list within each of the 7 NSL enriched modules was also calculated. P-values were FDR-corrected before summarising.

# FDR correct p values
PD_mendelian_rep2_gt$FDR <- p.adjust(PD_mendelian_rep2_gt$Fisher, method = "fdr")
PD_GWAS_rep2_gt$FDR <- p.adjust(PD_GWAS_rep2_gt$Fisher, method = "fdr")
# Collate module results into one table
gts1 <- data.frame(PD_mendelian_rep2_gt$Module, PD_mendelian_rep2_gt$FDR)
names(gts1)[names(gts1)=="PD_mendelian_rep2_gt.FDR"] <- "FDR"
names(gts1)[names(gts1)=="PD_mendelian_rep2_gt.Module"] <- "Module"
gts1$Gene_list <- "PD_mendelian"
gts2 <- data.frame(PD_GWAS_rep2_gt$Module, PD_GWAS_rep2_gt$FDR)
names(gts2)[names(gts2)=="PD_GWAS_rep2_gt.FDR"] <- "FDR"
names(gts2)[names(gts2)=="PD_GWAS_rep2_gt.Module"] <- "Module"
gts2$Gene_list <- "PD_GWAS"
gt_pvals_list <- dplyr::bind_rows(gts1, gts2)
# Display as true/false table
gt_TF <- gt_pvals_list %>% 
  dplyr::distinct() %>%
  tidyr::spread(Gene_list, FDR)
gt_TF <- dplyr::filter(gt_TF, Module %in% NSL_gt_modules)
gt_TF <- tibble::column_to_rownames(gt_TF, var = "Module")
knitr::kable(gt_TF)
PD_GWAS PD_mendelian
bisque4 0.5485244 0.4294452
black 0.9621176 NA
darkorange 0.8467960 0.3390120
floralwhite 0.0557624 0.2306364
greenyellow 0.3321864 0.8420000
mediumpurple3 0.9885107 NA
plum2 0.8467960 NA
# Filter out modules without NSL genes
melt_summ <- gt_pvals_list %>%
  dplyr::filter(Module %in% NSL_gt_modules) %>%
  dplyr::distinct()
# Prep for figure 
melt_summ$Gene_list[melt_summ$Gene_list=="PD_mendelian"] <- "Mendelian PD"
melt_summ$Gene_list[melt_summ$Gene_list=="PD_GWAS"] <- "Sporadic PD"
melt_summ$FDR <- as.numeric(melt_summ$FDR)
melt_summ$Module <- as.character(melt_summ$Module)
melt_summ$Gene_list <- as.character(melt_summ$Gene_list)
melt_summ <- dplyr::arrange(melt_summ, Module)
melt_summ$`-log10(FDR)` <- -log10(melt_summ$FDR)
# Plot
ggplot(melt_summ, aes(Gene_list, Module)) +
  geom_tile(aes(fill = `-log10(FDR)`)) +
  geom_text(aes(label = signif.num(FDR))) +
  scale_fill_gradient(low = "white", high = "lightsteelblue4", limits = c(0, 2.4)) +
  labs(y = "Module", x = "Gene list") +
  theme(
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm"))
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

4. Caudate gene co-expression network

4.1. Network structure

The overall structure of the network was first examined.

# Display all modules in a specified tissue/network 
CoExpNets::plotModSizes(tissue = "Caudate",
                        which.one = "gtexv6")

# Display network structure
Eigengenes <- getNetworkEigengenes(which.one = "gtexv6", 
                                   tissue = "Caudate")
cat("Caudate network was obtained from", nrow(Eigengenes),
    "samples and has", ncol(Eigengenes), "modules\n")
## Caudate network was obtained from 117 samples and has 17 modules
plotEGClustering(which.one = "gtexv6", 
                 tissue = "Caudate")

# Display correlations between modules
corrplot(cor(Eigengenes), tl.srt = 45,
         tl.col = "black",
         type = "upper", 
         tl.cex = 0.45,
         order = "hclust",
         title = paste0("Eigengene correlations"),
         mar = c(3, 2, 2.5, 4))

4.2. Gene set enrichment analysis

NSL genes

First, the NSL genes were searched in the coexpression network.

# Search genes of interest in specified tissue/network, including previously investigated KANSL1 ens.correction
NSL_report_gt <- reportOnGenes(tissue = "Caudate", 
                               genes = NSL_genes, 
                               which.one = "gtexv6", 
                               ens.correction = c("ENSG00000257382", "ENSG00000120071"))
# Select report
NSL_rep_gt <- NSL_report_gt$report 
# Convert report into dataframe
NSL_rep_gt_length <- nrow(NSL_rep_gt)
NSL_rep_gt <- data.frame(matrix(unlist(NSL_rep_gt), 
                                nrow = NSL_rep_gt_length, 
                                byrow = FALSE), stringsAsFactors=FALSE)
names(NSL_rep_gt)[1] <- "Gene"
names(NSL_rep_gt)[2] <- "ENSG"
names(NSL_rep_gt)[3] <- "MM"
names(NSL_rep_gt)[4] <- "Module"
names(NSL_rep_gt)[5] <- "Size"
names(NSL_rep_gt)[6] <- "GO_report"
names(NSL_rep_gt)[7] <- "PD_genes"
names(NSL_rep_gt)[8] <- "Preservation"
names(NSL_rep_gt)[9] <- "Cell_type"
names(NSL_rep_gt)[10] <- "Fisher"
# FDR correct p values
NSL_rep_gt$Fisher <- as.numeric(NSL_rep_gt$Fisher)
NSL_rep_gt$FDR <- p.adjust(NSL_rep_gt$Fisher, method="fdr")

The number of NSL genes is 9 and the number of genes in the search output was 9, suggesting there were 0 genes missing from the search.

# Print report without GO term and PD genes columns
NSL_rep_gt[c(1,4,3,5,10,11,9)]
# Store NSL enriched modules
NSL_gt_modules <- unique(NSL_rep_gt$Module)
# Select columns needed for figure
NSL_fig_gt <- data.frame(NSL_rep_gt$Module, NSL_rep_gt$Gene, NSL_rep_gt$FDR)
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.FDR"] <- "FDR"
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.Module"] <- "Module"
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.Gene"] <- "Gene"
# Prep for figure 
NSL_fig_gt$FDR <- as.numeric(NSL_fig_gt$FDR)
NSL_fig_gt$Module <- as.character(NSL_fig_gt$Module)
NSL_fig_gt <- dplyr::arrange(NSL_fig_gt, Module)
NSL_fig_gt$`-log10(FDR)` <- -log10(NSL_fig_gt$FDR)
# Produce plot
ggplot(NSL_fig_gt, aes(Gene, Module)) +
  geom_tile(aes(fill = `-log10(FDR)`)) +
  geom_text(aes(label = signif.num(FDR))) +
  scale_fill_gradient(low = "white", high = "goldenrod3", limits = c(0,2.4)) +
  labs(y = "Module") +
  theme(
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm"))
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

The NSL genes appeared in modules salmon, lightcyan, lightgreen, midnightblue, darkred, blue in this GCN but only reached nominal significance in the salmon module. The eigengenes plot shows modules darkred and midnightblue to be relatively close together, as well as modules lightgreen and blue, suggesting the genes within these pairs of modules may have some level of coexpression.

PD mendelian genes

Then the PD mendelian list was searched and the results filtered for the NSL enriched modules.

# Search genes of interest in specified tissue/network 
PD_mendelian_report_gt <- reportOnGenes(tissue = "Caudate", 
                                        genes = PD_mendelian_genes, 
                                        which.one = "gtexv6")
# Select report
PD_mendelian_rep_gt <- PD_mendelian_report_gt$report 
# Count number of genes in output
PD_mendelian_rep_gt_length <- nrow(PD_mendelian_rep_gt)

The number of PD mendelian genes is 43 and the number of genes in the search output was 39, suggesting there were 4 genes missing from the search which needed to be identified.

# Show missing genes
dplyr::setdiff(PD_mendelian_genes, PD_mendelian_rep_gt$gene)
## [1] "GBA"    "MAPT"   "PRKN"   "SLC6A3"
# Check if any synonyms of the genes are present in networks
check_synonyms("Caudate", 
               "gtexv6", 
               PD_mendelian_genes, 
               PD_mendelian_rep_gt$gene)

1 gene appeared in the network under an alias name, so this was replaced in the search list.

# Replace this in the list of genes
PD_mendelian_gt_genes <- as.character(PD_mendelian_genes)
PD_mendelian_gt_genes[PD_mendelian_gt_genes=="PRKN"] <- "PARK2"
# If genes still missing, check if ENSGs appear in source files
check_source_files("Caudate", 
                   "gtexv6", 
                   PD_mendelian_gt_genes, 
                   PD_mendelian_rep_gt$gene, 
                   file.path(here::here("data", "CoExp_source", "Caudate.resids.rds")))

2 more of the missing genes appeared in the network under these ENSGs, so an ens.correction was needed.

# Return ENSGs that are in the networks 
fromGeneName2Ensembl("GBA")
## [1] "ENSG00000262446"
fromGeneName2Ensembl("MAPT")
## [1] "ENSG00000257437"

The remaining gene did not appear in the source files under any ENSG from ensembl, thus it was assumed this gene does not appear in this dataset. The search can was then repeated with the most complete gene list possible.

# Again search genes of interest in specified tissue/network 
PD_mendelian_report2_gt <- reportOnGenes(tissue = "Caudate", 
                                         genes = PD_mendelian_gt_genes, 
                                         which.one = "gtexv6", 
                                         ens.correction = c("ENSG00000262446", "ENSG00000177628",
                                                            "ENSG00000257437", "ENSG00000186868"))
# Select report
PD_mendelian_rep2_gt <- PD_mendelian_report2_gt$report 
# Count number of genes in output
PD_mendelian_rep2_gt_length <- nrow(PD_mendelian_rep2_gt)

The number of genes in the final search output was 42, showing there was 1 fewer gene than the original input list.

# Convert report into dataframe
PD_mendelian_rep2_gt <- data.frame(matrix(unlist(PD_mendelian_rep2_gt), 
                                          nrow = PD_mendelian_rep2_gt_length, 
                                          byrow = FALSE), stringsAsFactors=FALSE)
names(PD_mendelian_rep2_gt)[1] <- "Gene"
names(PD_mendelian_rep2_gt)[2] <- "ENSG"
names(PD_mendelian_rep2_gt)[3] <- "MM"
names(PD_mendelian_rep2_gt)[4] <- "Module"
names(PD_mendelian_rep2_gt)[5] <- "Size"
names(PD_mendelian_rep2_gt)[6] <- "GO_report"
names(PD_mendelian_rep2_gt)[7] <- "PD_genes"
names(PD_mendelian_rep2_gt)[8] <- "Preservation"
names(PD_mendelian_rep2_gt)[9] <- "Cell_type"
names(PD_mendelian_rep2_gt)[10] <- "Fisher"
# Filter for NSL gene enriched modules
PD_mendelian_NSL_mod_gt <- PD_mendelian_rep2_gt %>%
  filter(Module %in% NSL_gt_modules)
# Print report without GO term and PD genes columns
PD_mendelian_NSL_mod_gt[, c(1,4,3,5,10,9)]

PD sporadic genes

The PD sporadic list was also searched and the results filtered for the NSL enriched modules.

# Search genes of interest in specified tissue/network
PD_GWAS_report_gt <- reportOnGenes(tissue = "Caudate",
                                   genes = PD_GWAS_genes, 
                                   which.one = "gtexv6")
# Select report
PD_GWAS_rep_gt <- PD_GWAS_report_gt$report 
# Count number of genes in output
PD_GWAS_rep_gt_length <- nrow(PD_GWAS_rep_gt)

The number of PD GWAS genes is 150 and the number of genes in the search output was 126, suggesting there were 24 genes missing from the search which needed to be identified.

# Show missing genes
dplyr::setdiff(PD_GWAS_genes, PD_GWAS_rep_gt$gene)
##  [1] "CCAR2"       "CSTA"        "GIN1"        "HLA-DQA1"    "HLA-DQA2"   
##  [6] "HLA-DQB1"    "HLA-DRA"     "HLA-DRB6"    "KLHL7-DT"    "MAPT"       
## [11] "MMRN1"       "NOD2"        "PPIP5K2"     "RAB29"       "RETREG3"    
## [16] "SGF29"       "SLC4A1"      "TRIM10"      "TRIM31"      "TRIM40"     
## [21] "VKORC1"      "WDR5B"       "ZNF668"      "ZSCAN16-AS1"
# Check if any synonyms of the genes are present in networks
check_synonyms("Caudate", 
               "gtexv6",
               PD_GWAS_genes, 
               PD_GWAS_rep_gt$gene)

5 genes appeared in the network under alias names, so these were replaced in the search list.

# Replace this in the list of genes
PD_GWAS_gt_genes <- as.character(PD_GWAS_genes)
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="CCAR2"] <- "KIAA1967"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="KLHL7-DT"] <- "KLHL7-AS1"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="RAB29"] <- "RAB7L1"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="RETREG3"] <- "FAM134C"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="SGF29"] <- "CCDC101"
# If genes still missing, check if ENSGs appear in source files
check_source_files("Caudate", 
                   "gtexv6", 
                   PD_GWAS_gt_genes, 
                   PD_GWAS_rep_gt$gene, 
                   file.path(here::here("data", "CoExp_source", "Caudate.resids.rds")))

8 more of the missing genes appeared in the network under these ENSGs, so an ens.correction was needed.

# Return ENSGs that are in the networks
fromGeneName2Ensembl("GIN1")
## [1] "ENSG00000263031"
fromGeneName2Ensembl("HLA-DQB1")
## [1] "ENSG00000233209"
fromGeneName2Ensembl("HLA-DRA")
## [1] "ENSG00000228987"
fromGeneName2Ensembl("MAPT")
## [1] "ENSG00000257437"
fromGeneName2Ensembl("PPIP5K2")
## [1] "ENSG00000262234"
fromGeneName2Ensembl("VKORC1")
## [1] "ENSG00000255439"
fromGeneName2Ensembl("ZNF668")
## [1] "ENSG00000232748"
fromGeneName2Ensembl("ZSCAN16-AS1")
## [1] "ZSCAN16-AS1"
# Return gene name that is attached to the correct ENSG of this gene
fromEnsembl2GeneName("ENSG00000269293")
## [1] "RP1-265C24.9"
# Replace this in the search list
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="ZSCAN16-AS1"] <- "RP1-265C24.9"

One of the above genes, ZSCAN16-AS1, could not be identified through the method of searching fromGeneName2Ensembl, but the ENSG was found instead using fromEnsembl2GeneName to be assigned to the gene name RP1-265C24.9, so this was replaced in the input list. The remaining genes did not appear in the source files under any ENSG from ensembl, thus it was assumed these genes do not appear in this dataset. A quick check on GTEx itself for these genes displayed very low levels in the frontal cortex. The search was then repeated with the most complete gene list possible.

# Again search genes of interest in specified tissue/network 
PD_GWAS_report2_gt <- reportOnGenes(tissue = "Caudate", 
                                    genes = PD_GWAS_gt_genes, 
                                    which.one = "gtexv6", 
                                    ens.correction = c("ENSG00000263031", "ENSG00000145723",
                                                       "ENSG00000233209", "ENSG00000179344",
                                                       "ENSG00000228987", "ENSG00000204287",
                                                       "ENSG00000257437", "ENSG00000186868",
                                                       "ENSG00000262234", "ENSG00000145725", 
                                                       "ENSG00000255439", "ENSG00000167397",
                                                       "ENSG00000232748", "ENSG00000167394"))
# Select report
PD_GWAS_rep2_gt <- PD_GWAS_report2_gt$report 
# Count number of genes in output
PD_GWAS_rep2_gt_length <- nrow(PD_GWAS_rep2_gt)

The number of genes in the final search output was 139, showing there were 11 fewer genes than the original input list.

# Convert report into dataframe
PD_GWAS_rep2_gt <- data.frame(matrix(unlist(PD_GWAS_rep2_gt), 
                                     nrow = PD_GWAS_rep2_gt_length, 
                                     byrow = FALSE), 
                              stringsAsFactors=FALSE)
names(PD_GWAS_rep2_gt)[1] <- "Gene"
names(PD_GWAS_rep2_gt)[2] <- "ENSG"
names(PD_GWAS_rep2_gt)[3] <- "MM"
names(PD_GWAS_rep2_gt)[4] <- "Module"
names(PD_GWAS_rep2_gt)[5] <- "Size"
names(PD_GWAS_rep2_gt)[6] <- "GO_report"
names(PD_GWAS_rep2_gt)[7] <- "PD_genes"
names(PD_GWAS_rep2_gt)[8] <- "Preservation"
names(PD_GWAS_rep2_gt)[9] <- "Cell_type"
names(PD_GWAS_rep2_gt)[10] <- "Fisher"
# Filter for NSL modules
PD_GWAS_NSL_mod_gt <- PD_GWAS_rep2_gt %>%
  filter(Module %in% NSL_gt_modules)
# Print report without GO term and PD genes column
PD_GWAS_NSL_mod_gt[, c(1,4,3,5,10,9)]

Gene ontology terms

To examine the functional characteristics of these modules, the GO term enrichments annotated to these modules were extracted and simplified according to parent terms.

# Obtain gprofiler2 annotations from the CoExpGTEx frontal cortex source files
e = read.csv(file.path(here::here("data", 
                                  "CoExp_source", 
                                  "netCaudate.9.it.50.rds_gprof.csv")), 
             stringsAsFactors = FALSE)
# Extract all GO term enrichments from source file for modules of interest
NSL_GO = e[e$query.number %in% NSL_gt_modules & e$domain %in% c("BP", "CC", "MF"),]
NSL_GO_2 <- dplyr::select(NSL_GO, query.number, term.id, term.name, domain, p.value)
# Rename columns
names(NSL_GO_2)[names(NSL_GO_2)=="query.number"] <- "gene_list"
names(NSL_GO_2)[names(NSL_GO_2)=="term.id"] <- "go_id"
names(NSL_GO_2)[names(NSL_GO_2)=="domain"] <- "go_type"
names(NSL_GO_2)[names(NSL_GO_2)=="term.name"] <- "go_term"
# FDR correct p values
NSL_GO_2$p.value <- as.numeric(NSL_GO_2$p.value)
NSL_GO_2$FDR <- p.adjust(NSL_GO_2$p.value, method="fdr")
# GO reduce all terms to parent terms
NSL_GO_red <- go_reduce(pathway_df = NSL_GO_2, 
                        threshold = 0.7,
                        scores = NULL)
# Format
NSL_GO_red$go_type[NSL_GO_red$go_type=="BP"] <- "GO: BP"
NSL_GO_red$go_type[NSL_GO_red$go_type=="MF"] <- "GO: MF"
NSL_GO_red$go_type[NSL_GO_red$go_type=="CC"] <- "GO: CC"
NSL_GO_red <- NSL_GO_red %>%
  tidyr::drop_na(parent_term) %>%
  dplyr::arrange(desc(FDR))
# Plot
ggplot2::ggplot(NSL_GO_red, aes(x = gene_list, y = parent_term)) + 
  geom_point(size = 2, aes(colour = -log10(`FDR`))) +
  scale_color_continuous(low = "lightgoldenrod", high = "red4", limits = c(0,58)) +
  facet_grid(go_type ~., scales = "free", space = "free") +
  labs(x = "Module", y = "") +
  theme(
    strip.text.y = element_text(angle = 90, size = 6),
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm")
    )

4.2. Summary

The frequency of genes within each gene list apppearing in each of the 6 NSL enriched modules was calculated.

# Collate module results into one table
gt1 <- PD_mendelian_NSL_mod_gt['Module']
gt1['Gene_list'] <- "PD_mendelian"
gt2 <- PD_GWAS_NSL_mod_gt['Module']
gt2['Gene_list'] <- "PD_GWAS"
gt_mods_list <- do.call(rbind, list(gt1, gt2))
# Display as frequency of each module within each list
gt_mod_freq <- with(gt_mods_list, table(Module, Gene_list))
gt_mod_freq <- as.data.frame.matrix(gt_mod_freq)
knitr::kable(gt_mod_freq)
PD_GWAS PD_mendelian
blue 10 8
darkred 8 3
lightcyan 7 0
lightgreen 7 1
midnightblue 9 0
salmon 16 5

The significance of the enrichments of each gene list within each of the 6 NSL enriched modules was also calculated. P-values were FDR-corrected before summarising.

# FDR correct p values
PD_mendelian_rep2_gt$FDR <- p.adjust(PD_mendelian_rep2_gt$Fisher, method = "fdr")
PD_GWAS_rep2_gt$FDR <- p.adjust(PD_GWAS_rep2_gt$Fisher, method = "fdr")
# Collate module results into one table
gts1 <- data.frame(PD_mendelian_rep2_gt$Module, PD_mendelian_rep2_gt$FDR)
names(gts1)[names(gts1)=="PD_mendelian_rep2_gt.FDR"] <- "FDR"
names(gts1)[names(gts1)=="PD_mendelian_rep2_gt.Module"] <- "Module"
gts1$Gene_list <- "PD_mendelian"
gts2 <- data.frame(PD_GWAS_rep2_gt$Module, PD_GWAS_rep2_gt$FDR)
names(gts2)[names(gts2)=="PD_GWAS_rep2_gt.FDR"] <- "FDR"
names(gts2)[names(gts2)=="PD_GWAS_rep2_gt.Module"] <- "Module"
gts2$Gene_list <- "PD_GWAS"
gt_pvals_list <- dplyr::bind_rows(gts1, gts2)
# Display as true/false table
gt_TF <- gt_pvals_list %>% 
  dplyr::distinct() %>%
  tidyr::spread(Gene_list, FDR)
gt_TF <- dplyr::filter(gt_TF, Module %in% NSL_gt_modules)
gt_TF <- tibble::column_to_rownames(gt_TF, var = "Module")
knitr::kable(gt_TF)
PD_GWAS PD_mendelian
blue 0.9740296 0.14091
darkred 0.9740296 0.60588
lightcyan 0.8326762 NA
lightgreen 0.9740296 0.94630
midnightblue 0.7074558 NA
salmon 0.0851691 0.27820
# Filter out modules without NSL genes
melt_summ <- gt_pvals_list %>%
  dplyr::filter(Module %in% NSL_gt_modules) %>%
  dplyr::distinct()
# Prep for figure 
melt_summ$Gene_list[melt_summ$Gene_list=="PD_mendelian"] <- "Mendelian PD"
melt_summ$Gene_list[melt_summ$Gene_list=="PD_GWAS"] <- "Sporadic PD"
melt_summ$FDR <- as.numeric(melt_summ$FDR)
melt_summ$Module <- as.character(melt_summ$Module)
melt_summ$Gene_list <- as.character(melt_summ$Gene_list)
melt_summ <- dplyr::arrange(melt_summ, Module)
melt_summ$`-log10(FDR)` <- -log10(melt_summ$FDR)
# Plot
ggplot(melt_summ, aes(Gene_list, Module)) +
  geom_tile(aes(fill = `-log10(FDR)`)) +
  geom_text(aes(label = signif.num(FDR))) +
  scale_fill_gradient(low = "white", high = "lightsteelblue4", limits = c(0, 2.4)) +
  labs(y = "Module", x = "Gene list") +
  theme(
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm"))
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

5. Putamen gene co-expression network

5.1. Network structure

The overall structure of the network was first examined.

# Display all modules in a specified tissue/network 
CoExpNets::plotModSizes(tissue = "Putamen",
                        which.one = "gtexv6")

# Display network structure
Eigengenes <- getNetworkEigengenes(which.one = "gtexv6", 
                                   tissue = "Putamen")
cat("Putamen network was obtained from", nrow(Eigengenes),
    "samples and has", ncol(Eigengenes), "modules\n")
## Putamen network was obtained from 97 samples and has 35 modules
plotEGClustering(which.one = "gtexv6", 
                 tissue = "Putamen")

# Display correlations between modules
corrplot(cor(Eigengenes), tl.srt = 45,
         tl.col = "black",
         type = "upper", 
         tl.cex = 0.45,
         order = "hclust",
         title = paste0("Eigengene correlations"),
         mar = c(3, 2, 2.5, 4))

5.2. Gene set enrichment analysis

NSL genes

First, the NSL genes were searched in the coexpression network.

# Search genes of interest in specified tissue/network, including previously investigated KANSL1 ens.correction
NSL_report_gt <- reportOnGenes(tissue = "Putamen", 
                               genes = NSL_genes, 
                               which.one = "gtexv6", 
                               ens.correction = c("ENSG00000257382", "ENSG00000120071"))
# Select report
NSL_rep_gt <- NSL_report_gt$report 
# Convert report into dataframe
NSL_rep_gt_length <- nrow(NSL_rep_gt)
NSL_rep_gt <- data.frame(matrix(unlist(NSL_rep_gt), 
                                nrow = NSL_rep_gt_length, 
                                byrow = FALSE), stringsAsFactors=FALSE)
names(NSL_rep_gt)[1] <- "Gene"
names(NSL_rep_gt)[2] <- "ENSG"
names(NSL_rep_gt)[3] <- "MM"
names(NSL_rep_gt)[4] <- "Module"
names(NSL_rep_gt)[5] <- "Size"
names(NSL_rep_gt)[6] <- "GO_report"
names(NSL_rep_gt)[7] <- "PD_genes"
names(NSL_rep_gt)[8] <- "Preservation"
names(NSL_rep_gt)[9] <- "Cell_type"
names(NSL_rep_gt)[10] <- "Fisher"
# FDR correct p values
NSL_rep_gt$Fisher <- as.numeric(NSL_rep_gt$Fisher)
NSL_rep_gt$FDR <- p.adjust(NSL_rep_gt$Fisher, method="fdr")

The number of NSL genes is 9 and the number of genes in the search output was 9, suggesting there were 0 genes missing from the search.

# Print report without GO term and PD genes columns
NSL_rep_gt[c(1,4,3,5,10,11,9)]
# Store NSL enriched modules
NSL_gt_modules <- unique(NSL_rep_gt$Module)
# Select columns needed for figure
NSL_fig_gt <- data.frame(NSL_rep_gt$Module, NSL_rep_gt$Gene, NSL_rep_gt$FDR)
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.FDR"] <- "FDR"
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.Module"] <- "Module"
names(NSL_fig_gt)[names(NSL_fig_gt)=="NSL_rep_gt.Gene"] <- "Gene"
# Prep for figure 
NSL_fig_gt$FDR <- as.numeric(NSL_fig_gt$FDR)
NSL_fig_gt$Module <- as.character(NSL_fig_gt$Module)
NSL_fig_gt <- dplyr::arrange(NSL_fig_gt, Module)
NSL_fig_gt$`-log10(FDR)` <- -log10(NSL_fig_gt$FDR)
# Produce plot
ggplot(NSL_fig_gt, aes(Gene, Module)) +
  geom_tile(aes(fill = `-log10(FDR)`)) +
  geom_text(aes(label = signif.num(FDR))) +
  scale_fill_gradient(low = "white", high = "goldenrod3", limits = c(0,2.4)) +
  labs(y = "Module") +
  theme(
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm"))
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

The NSL genes appeared in modules darkred, lightgreen, magenta, pink, lightcyan1, salmon, royalblue, black in this GCN but failed to reach significance in any module. The eigengenes plot shows modules darkred and lightcyan1 to be relatively close together, as well as modules magenta, royalblue and lightgreen, suggesting the genes within these groups of modules may have some level of coexpression.

PD mendelian genes

Then the PD mendelian list was searched and the results filtered for the NSL enriched modules.

# Search genes of interest in specified tissue/network 
PD_mendelian_report_gt <- reportOnGenes(tissue = "Putamen", 
                                        genes = PD_mendelian_genes, 
                                        which.one = "gtexv6")
# Select report
PD_mendelian_rep_gt <- PD_mendelian_report_gt$report 
# Count number of genes in output
PD_mendelian_rep_gt_length <- nrow(PD_mendelian_rep_gt)

The number of PD mendelian genes is 43 and the number of genes in the search output was 39, suggesting there were 4 genes missing from the search which needed to be identified.

# Show missing genes
dplyr::setdiff(PD_mendelian_genes, PD_mendelian_rep_gt$gene)
## [1] "GBA"    "MAPT"   "PRKN"   "SLC6A3"
# Check if any synonyms of the genes are present in networks
check_synonyms("Putamen", 
               "gtexv6", 
               PD_mendelian_genes, 
               PD_mendelian_rep_gt$gene)

1 gene appeared in the network under an alias name, so this was replaced in the search list.

# Replace this in the list of genes
PD_mendelian_gt_genes <- as.character(PD_mendelian_genes)
PD_mendelian_gt_genes[PD_mendelian_gt_genes=="PRKN"] <- "PARK2"
# If genes still missing, check if ENSGs appear in source files
check_source_files("Putamen", 
                   "gtexv6", 
                   PD_mendelian_gt_genes, 
                   PD_mendelian_rep_gt$gene, 
                   file.path(here::here("data", "CoExp_source", "Putamen.resids.rds")))

2 more of the missing genes appeared in the network under these ENSGs, so an ens.correction was needed.

# Return ENSGs that are in the networks 
fromGeneName2Ensembl("GBA")
## [1] "ENSG00000262446"
fromGeneName2Ensembl("MAPT")
## [1] "ENSG00000257437"

The remaining gene did not appear in the source files under any ENSG from ensembl, thus it was assumed this gene does not appear in this dataset. The search can was then repeated with the most complete gene list possible.

# Again search genes of interest in specified tissue/network 
PD_mendelian_report2_gt <- reportOnGenes(tissue = "Putamen", 
                                         genes = PD_mendelian_gt_genes, 
                                         which.one = "gtexv6", 
                                         ens.correction = c("ENSG00000262446", "ENSG00000177628",
                                                            "ENSG00000257437", "ENSG00000186868"))
# Select report
PD_mendelian_rep2_gt <- PD_mendelian_report2_gt$report 
# Count number of genes in output
PD_mendelian_rep2_gt_length <- nrow(PD_mendelian_rep2_gt)

The number of genes in the final search output was 42, showing there was 1 fewer gene than the original input list.

# Convert report into dataframe
PD_mendelian_rep2_gt <- data.frame(matrix(unlist(PD_mendelian_rep2_gt), 
                                          nrow = PD_mendelian_rep2_gt_length, 
                                          byrow = FALSE), stringsAsFactors=FALSE)
names(PD_mendelian_rep2_gt)[1] <- "Gene"
names(PD_mendelian_rep2_gt)[2] <- "ENSG"
names(PD_mendelian_rep2_gt)[3] <- "MM"
names(PD_mendelian_rep2_gt)[4] <- "Module"
names(PD_mendelian_rep2_gt)[5] <- "Size"
names(PD_mendelian_rep2_gt)[6] <- "GO_report"
names(PD_mendelian_rep2_gt)[7] <- "PD_genes"
names(PD_mendelian_rep2_gt)[8] <- "Preservation"
names(PD_mendelian_rep2_gt)[9] <- "Cell_type"
names(PD_mendelian_rep2_gt)[10] <- "Fisher"
# Filter for NSL gene enriched modules
PD_mendelian_NSL_mod_gt <- PD_mendelian_rep2_gt %>%
  filter(Module %in% NSL_gt_modules)
# Print report without GO term and PD genes columns
PD_mendelian_NSL_mod_gt[, c(1,4,3,5,10,9)]

PD sporadic genes

The PD sporadic list was also searched and the results filtered for the NSL enriched modules.

# Search genes of interest in specified tissue/network
PD_GWAS_report_gt <- reportOnGenes(tissue = "Putamen",
                                   genes = PD_GWAS_genes, 
                                   which.one = "gtexv6")
# Select report
PD_GWAS_rep_gt <- PD_GWAS_report_gt$report 
# Count number of genes in output
PD_GWAS_rep_gt_length <- nrow(PD_GWAS_rep_gt)

The number of PD GWAS genes is 150 and the number of genes in the search output was 125, suggesting there were 25 genes missing from the search which needed to be identified.

# Show missing genes
dplyr::setdiff(PD_GWAS_genes, PD_GWAS_rep_gt$gene)
##  [1] "CCAR2"       "CSTA"        "GIN1"        "HIST1H4J"    "HLA-DQA1"   
##  [6] "HLA-DQA2"    "HLA-DQB1"    "HLA-DRA"     "HLA-DRB6"    "KLHL7-DT"   
## [11] "MAPT"        "MMRN1"       "NOD2"        "PPIP5K2"     "RAB29"      
## [16] "RETREG3"     "SGF29"       "SLC4A1"      "TRIM10"      "TRIM31"     
## [21] "TRIM40"      "VKORC1"      "WDR5B"       "ZNF668"      "ZSCAN16-AS1"
# Check if any synonyms of the genes are present in networks
check_synonyms("Putamen", 
               "gtexv6",
               PD_GWAS_genes, 
               PD_GWAS_rep_gt$gene)

5 genes appeared in the network under alias names, so these were replaced in the search list.

# Replace this in the list of genes
PD_GWAS_gt_genes <- as.character(PD_GWAS_genes)
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="CCAR2"] <- "KIAA1967"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="KLHL7-DT"] <- "KLHL7-AS1"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="RAB29"] <- "RAB7L1"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="RETREG3"] <- "FAM134C"
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="SGF29"] <- "CCDC101"
# If genes still missing, check if ENSGs appear in source files
check_source_files("Putamen", 
                   "gtexv6", 
                   PD_GWAS_gt_genes, 
                   PD_GWAS_rep_gt$gene, 
                   file.path(here::here("data", "CoExp_source", "Putamen.resids.rds")))

8 more of the missing genes appeared in the network under these ENSGs, so an ens.correction was needed.

# Return ENSGs that are in the networks
fromGeneName2Ensembl("GIN1")
## [1] "ENSG00000263031"
fromGeneName2Ensembl("HLA-DQB1")
## [1] "ENSG00000233209"
fromGeneName2Ensembl("HLA-DRA")
## [1] "ENSG00000228987"
fromGeneName2Ensembl("MAPT")
## [1] "ENSG00000257437"
fromGeneName2Ensembl("PPIP5K2")
## [1] "ENSG00000262234"
fromGeneName2Ensembl("VKORC1")
## [1] "ENSG00000255439"
fromGeneName2Ensembl("ZNF668")
## [1] "ENSG00000232748"
fromGeneName2Ensembl("ZSCAN16-AS1")
## [1] "ZSCAN16-AS1"
# Return gene name that is attached to the correct ENSG of this gene
fromEnsembl2GeneName("ENSG00000269293")
## [1] "RP1-265C24.9"
# Replace this in the search list
PD_GWAS_gt_genes[PD_GWAS_gt_genes=="ZSCAN16-AS1"] <- "RP1-265C24.9"

One of the above genes, ZSCAN16-AS1, could not be identified through the method of searching fromGeneName2Ensembl, but the ENSG was found instead using fromEnsembl2GeneName to be assigned to the gene name RP1-265C24.9, so this was replaced in the input list. The remaining genes did not appear in the source files under any ENSG from ensembl, thus it was assumed these genes do not appear in this dataset. A quick check on GTEx itself for these genes displayed very low levels in the frontal cortex. The search was then repeated with the most complete gene list possible.

# Again search genes of interest in specified tissue/network 
PD_GWAS_report2_gt <- reportOnGenes(tissue = "Putamen", 
                                    genes = PD_GWAS_gt_genes, 
                                    which.one = "gtexv6", 
                                    ens.correction = c("ENSG00000263031", "ENSG00000145723",
                                                       "ENSG00000233209", "ENSG00000179344",
                                                       "ENSG00000228987", "ENSG00000204287",
                                                       "ENSG00000257437", "ENSG00000186868",
                                                       "ENSG00000262234", "ENSG00000145725", 
                                                       "ENSG00000255439", "ENSG00000167397",
                                                       "ENSG00000232748", "ENSG00000167394"))
# Select report
PD_GWAS_rep2_gt <- PD_GWAS_report2_gt$report 
# Count number of genes in output
PD_GWAS_rep2_gt_length <- nrow(PD_GWAS_rep2_gt)

The number of genes in the final search output was 138, showing there were 12 fewer genes than the original input list.

# Convert report into dataframe
PD_GWAS_rep2_gt <- data.frame(matrix(unlist(PD_GWAS_rep2_gt), 
                                     nrow = PD_GWAS_rep2_gt_length, 
                                     byrow = FALSE), 
                              stringsAsFactors=FALSE)
names(PD_GWAS_rep2_gt)[1] <- "Gene"
names(PD_GWAS_rep2_gt)[2] <- "ENSG"
names(PD_GWAS_rep2_gt)[3] <- "MM"
names(PD_GWAS_rep2_gt)[4] <- "Module"
names(PD_GWAS_rep2_gt)[5] <- "Size"
names(PD_GWAS_rep2_gt)[6] <- "GO_report"
names(PD_GWAS_rep2_gt)[7] <- "PD_genes"
names(PD_GWAS_rep2_gt)[8] <- "Preservation"
names(PD_GWAS_rep2_gt)[9] <- "Cell_type"
names(PD_GWAS_rep2_gt)[10] <- "Fisher"
# Filter for NSL modules
PD_GWAS_NSL_mod_gt <- PD_GWAS_rep2_gt %>%
  filter(Module %in% NSL_gt_modules)
# Print report without GO term and PD genes column
PD_GWAS_NSL_mod_gt[, c(1,4,3,5,10,9)]

Gene ontology terms

To examine the functional characteristics of these modules, the GO term enrichments annotated to these modules were extracted and simplified according to parent terms.

# Obtain gprofiler2 annotations from the CoExpGTEx frontal cortex source files
e = read.csv(file.path(here::here("data", 
                                  "CoExp_source", 
                                  "netPutamen.21.it.50.rds_gprof.csv")), 
             stringsAsFactors = FALSE)
# Extract all GO term enrichments from source file for modules of interest
NSL_GO = e[e$query.number %in% NSL_gt_modules & e$domain %in% c("BP", "CC", "MF"),]
NSL_GO_2 <- dplyr::select(NSL_GO, query.number, term.id, term.name, domain, p.value)
# Rename columns
names(NSL_GO_2)[names(NSL_GO_2)=="query.number"] <- "gene_list"
names(NSL_GO_2)[names(NSL_GO_2)=="term.id"] <- "go_id"
names(NSL_GO_2)[names(NSL_GO_2)=="domain"] <- "go_type"
names(NSL_GO_2)[names(NSL_GO_2)=="term.name"] <- "go_term"
# FDR correct p values
NSL_GO_2$p.value <- as.numeric(NSL_GO_2$p.value)
NSL_GO_2$FDR <- p.adjust(NSL_GO_2$p.value, method="fdr")
# GO reduce all terms to parent terms
NSL_GO_red <- go_reduce(pathway_df = NSL_GO_2, 
                        threshold = 0.7,
                        scores = NULL)
# Format
NSL_GO_red$go_type[NSL_GO_red$go_type=="BP"] <- "GO: BP"
NSL_GO_red$go_type[NSL_GO_red$go_type=="MF"] <- "GO: MF"
NSL_GO_red$go_type[NSL_GO_red$go_type=="CC"] <- "GO: CC"
NSL_GO_red <- NSL_GO_red %>%
  tidyr::drop_na(parent_term) %>%
  dplyr::arrange(desc(FDR))
# Plot
ggplot2::ggplot(NSL_GO_red, aes(x = gene_list, y = parent_term)) + 
  geom_point(size = 2, aes(colour = -log10(`FDR`))) +
  scale_color_continuous(low = "lightgoldenrod", high = "red4", limits = c(0,58)) +
  facet_grid(go_type ~., scales = "free", space = "free") +
  labs(x = "Module", y = "") +
  theme(
    strip.text.y = element_text(angle = 90, size = 6),
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm")
    )

5.2. Summary

The frequency of genes within each gene list apppearing in each of the 8 NSL enriched modules was calculated.

# Collate module results into one table
gt1 <- PD_mendelian_NSL_mod_gt['Module']
gt1['Gene_list'] <- "PD_mendelian"
gt2 <- PD_GWAS_NSL_mod_gt['Module']
gt2['Gene_list'] <- "PD_GWAS"
gt_mods_list <- do.call(rbind, list(gt1, gt2))
# Display as frequency of each module within each list
gt_mod_freq <- with(gt_mods_list, table(Module, Gene_list))
gt_mod_freq <- as.data.frame.matrix(gt_mod_freq)
knitr::kable(gt_mod_freq)
PD_GWAS PD_mendelian
black 6 3
darkred 17 6
lightcyan1 10 2
lightgreen 6 1
magenta 6 0
pink 5 1
royalblue 5 2
salmon 5 2

The significance of the enrichments of each gene list within each of the 8 NSL enriched modules was also calculated. P-values were FDR-corrected before summarising.

# FDR correct p values
PD_mendelian_rep2_gt$FDR <- p.adjust(PD_mendelian_rep2_gt$Fisher, method = "fdr")
PD_GWAS_rep2_gt$FDR <- p.adjust(PD_GWAS_rep2_gt$Fisher, method = "fdr")
# Collate module results into one table
gts1 <- data.frame(PD_mendelian_rep2_gt$Module, PD_mendelian_rep2_gt$FDR)
names(gts1)[names(gts1)=="PD_mendelian_rep2_gt.FDR"] <- "FDR"
names(gts1)[names(gts1)=="PD_mendelian_rep2_gt.Module"] <- "Module"
gts1$Gene_list <- "PD_mendelian"
gts2 <- data.frame(PD_GWAS_rep2_gt$Module, PD_GWAS_rep2_gt$FDR)
names(gts2)[names(gts2)=="PD_GWAS_rep2_gt.FDR"] <- "FDR"
names(gts2)[names(gts2)=="PD_GWAS_rep2_gt.Module"] <- "Module"
gts2$Gene_list <- "PD_GWAS"
gt_pvals_list <- dplyr::bind_rows(gts1, gts2)
# Display as true/false table
gt_TF <- gt_pvals_list %>% 
  dplyr::distinct() %>%
  tidyr::spread(Gene_list, FDR)
gt_TF <- dplyr::filter(gt_TF, Module %in% NSL_gt_modules)
gt_TF <- tibble::column_to_rownames(gt_TF, var = "Module")
knitr::kable(gt_TF)
PD_GWAS PD_mendelian
black 0.8266731 0.5102160
darkred 0.0077613 0.0708867
lightcyan1 0.1956255 0.7208069
lightgreen 0.6422447 0.8135707
magenta 0.6676171 NA
pink 0.8644270 0.8362000
royalblue 0.8732031 0.7316167
salmon 0.8732031 0.7316167
# Filter out modules without NSL genes
melt_summ <- gt_pvals_list %>%
  dplyr::filter(Module %in% NSL_gt_modules) %>%
  dplyr::distinct()
# Prep for figure 
melt_summ$Gene_list[melt_summ$Gene_list=="PD_mendelian"] <- "Mendelian PD"
melt_summ$Gene_list[melt_summ$Gene_list=="PD_GWAS"] <- "Sporadic PD"
melt_summ$FDR <- as.numeric(melt_summ$FDR)
melt_summ$Module <- as.character(melt_summ$Module)
melt_summ$Gene_list <- as.character(melt_summ$Gene_list)
melt_summ <- dplyr::arrange(melt_summ, Module)
melt_summ$`-log10(FDR)` <- -log10(melt_summ$FDR)
# Plot
ggplot(melt_summ, aes(Gene_list, Module)) +
  geom_tile(aes(fill = `-log10(FDR)`)) +
  geom_text(aes(label = signif.num(FDR))) +
  scale_fill_gradient(low = "white", high = "lightsteelblue4", limits = c(0, 2.4)) +
  labs(y = "Module", x = "Gene list") +
  theme(
    axis.text = element_text(size = 6),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.key.size = unit(4, "mm"))
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.